This notebook explores the characteristics of the eLife data. It is structured into two major parts:
import os, glob, random, scipy, csv, requests,pycountry
import geopandas as gpd
import pandas as pd
import numpy as np
from genderize import Genderize
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression, LinearRegression
# Silence pandas SettingWithCopy warnings (the assignments below are intentional).
pd.set_option('mode.chained_assignment', None)
# Notebook display magics: inline plots rendered at retina resolution.
%matplotlib inline
%config InlineBackend.figure_format='retina'
# Global plot styling for all figures below.
plt.rcParams["figure.figsize"] = (10,10)
plt.rcParams["font.family"] = "serif"
# NOTE(review): 'Times New Roman' is assigned to font.sans-serif while
# font.family is "serif" — this entry is likely never used; confirm intent.
plt.rcParams['font.sans-serif'] = ['Times New Roman']
plt.rcParams["font.style"] = "normal"
plt.rcParams["axes.labelcolor"] = "gray"
plt.rcParams["text.color"] = "grey"
# Code directory (where this notebook runs)
WD = f"{os.getcwd()}/"
# Data directories
# Review data
RD = f"{os.path.abspath('..')}/00_data/"
# Consultation data
CD = f"{os.path.abspath('../..')}/01_consultations/00_data/"
# Raw eLife data
ELIFE = '/home/jupyter/eLife_Raw_Data/'
Initial consultations — when an editor asks another to vet a submission — come in 10 files. These files were messy (with comment fields erroneously spliced, etc.), so I had to clean and merge them (see merge_consultations notebook).
# Locate and load the merged initial-consultation CSV
# (see merge_consultations notebook for how the raw files were cleaned).
init_consult_file = glob.glob(CD+"*Initial*ALL.csv")[0]
init_consults = pd.read_csv(init_consult_file)
# (rows, columns)
init_consults.shape
(256937, 6)
# Number of distinct manuscripts with at least one initial-consultation comment.
init_consults['Manuscript no.'].nunique()
46801
# Print each column of the initial-consultation dataframe, one per line.
for column_name in init_consults.columns:
    print(column_name)
Manuscript no. session_title Comment date Comment text Commenter Commenter ID
# Earliest comment date — the ' ""' result below shows some dates are dirty strings.
min(init_consults['Comment date'])
' ""'
These are shorter than reviewer consultation (below). These don't include the editorial letter draft, which editors often copy and paste into the reviewer consultation exchange.
def get_comment_lens(s):
    """Return the number of whitespace-delimited tokens in *s*."""
    return len(s.split())
# Token length of every initial-consultation comment.
init_consults["lengths"] = init_consults['Comment text'].apply(get_comment_lens)
print("-"*40)
print("Initial Consultation Comment Lengths")
print("-"*40)
# Summary statistics of comment lengths, rounded to 2 decimals.
pd.DataFrame(round(init_consults['lengths'].describe(), 2))
---------------------------------------- Initial Consultation Comment Lengths ----------------------------------------
| lengths | |
|---|---|
| count | 256937.00 |
| mean | 70.44 |
| std | 106.77 |
| min | 1.00 |
| 25% | 19.00 |
| 50% | 41.00 |
| 75% | 84.00 |
| max | 7083.00 |
# Plot: histogram of tokens per initial-consultation comment (x truncated at 1,000).
# NOTE(review): plt.grid(b=True, ...) uses the pre-3.5 matplotlib keyword `b`
# (renamed `visible`) — confirm the pinned matplotlib version before upgrading.
title_color, tick_color, edge_color, data_color = 'black', 'darkgray', 'dimgray', 'gainsboro'
font_size = 25
font_weight = "bold"
fig, ax = plt.subplots()
plt.style.use('seaborn-white')
plt.setp(ax.spines.values(), color=data_color)
plt.hist(init_consults['lengths'],
         bins=100,
         range=[0,1000],
         edgecolor=edge_color,
         facecolor=data_color,
         linewidth=1.5)
plt.xlabel('N Tokens per Initial Consultation Comment',
           fontsize=font_size,
           labelpad=25,
           color=title_color,
           weight=font_weight)
plt.ylabel('N Initial\nConsultation\nComments',
           fontsize=font_size,
           rotation=0,
           labelpad=105,
           color=title_color,
           weight=font_weight)
plt.grid(b=True,
         color=tick_color,
         axis="y",
         alpha=0.8,
         linestyle=':',
         linewidth=1)
plt.xticks(fontsize=25,
           color=tick_color,
           weight=font_weight,
           rotation = 60)
plt.yticks(fontsize=25,
           color=tick_color,
           weight=font_weight)
plt.show()
# Tabulate: distribution of distinct comment dates (≈ N comments) per manuscript.
print("-"*33)
print("N CONSULTATION COMMENTS PER MS:")
print("-"*33)
pd.DataFrame(round(init_consults.groupby("Manuscript no.")['Comment date'].nunique().describe(percentiles=[.25,.5, .75, .95, .99]), 2))
--------------------------------- N CONSULTATION COMMENTS PER MS: ---------------------------------
| Comment date | |
|---|---|
| count | 46801.00 |
| mean | 5.49 |
| std | 3.53 |
| min | 1.00 |
| 25% | 3.00 |
| 50% | 5.00 |
| 75% | 7.00 |
| 95% | 12.00 |
| 99% | 18.00 |
| max | 130.00 |
# Plot: histogram of N initial-consultation comments per manuscript.
# NOTE(review): this uses .count() while the table above used .nunique() of
# 'Comment date' — the two can differ when timestamps repeat; confirm intent.
title_color, tick_color, edge_color, data_color = 'black', 'darkgray', 'dimgray', 'gainsboro'
font_size = 25
font_weight = "bold"
fig, ax = plt.subplots()
plt.style.use('seaborn-white')
plt.setp(ax.spines.values(), color=data_color)
plt.hist(init_consults.groupby("Manuscript no.")['Comment date'].count(),
         bins=20,
         edgecolor=edge_color,
         facecolor=data_color,
         linewidth=1.5)
plt.xlabel('N Initial Consultation Comments per MS',
           fontsize=font_size,
           labelpad=25,
           color=title_color,
           weight=font_weight)
plt.ylabel('N\nManuscripts',
           fontsize=font_size,
           rotation=0,
           labelpad=90,
           color=title_color,
           weight=font_weight)
plt.grid(b=True,
         color=tick_color,
         axis="y",
         alpha=0.8,
         linestyle=':',
         linewidth=1)
plt.xticks(fontsize=25,
           color=tick_color,
           weight=font_weight)
plt.yticks(fontsize=25,
           color=tick_color,
           weight=font_weight)
plt.show()
Zoomed in to a smaller range of n comments:
# Plot: same histogram as above, zoomed to 0-20 comments per manuscript.
title_color, tick_color, edge_color, data_color = 'black', 'darkgray', 'dimgray', 'gainsboro'
font_size = 25
font_weight = "bold"
fig, ax = plt.subplots()
plt.style.use('seaborn-white')
plt.setp(ax.spines.values(), color=data_color)
plt.hist(init_consults.groupby("Manuscript no.")['Comment date'].count(),
         bins=20,
         range=[0,20],
         edgecolor=edge_color,
         facecolor=data_color,
         linewidth=1.5)
plt.xlabel('N Initial Consultation Comments per MS',
           fontsize=font_size,
           labelpad=25,
           color=title_color,
           weight=font_weight)
plt.ylabel('N\nManuscripts',
           fontsize=font_size,
           rotation=0,
           labelpad=90,
           color=title_color,
           weight=font_weight)
plt.grid(b=True,
         color=tick_color,
         axis="y",
         alpha=0.8,
         linestyle=':',
         linewidth=1)
plt.xticks(fontsize=25,
           ticks=list(range(0,20,2)),
           color=tick_color,
           weight=font_weight)
plt.yticks(fontsize=25,
           color=tick_color,
           weight=font_weight)
plt.show()
# Tabulate: distribution of distinct commenting editors per manuscript.
print("-"*33)
print("N CONSULTING EDITORS PER MS:")
print("-"*33)
pd.DataFrame(round(init_consults.groupby("Manuscript no.")['Commenter ID'].nunique().describe(percentiles=[.25,.5, .75, .95, .99]), 2))
--------------------------------- N CONSULTING EDITORS PER MS: ---------------------------------
| Commenter ID | |
|---|---|
| count | 46801.00 |
| mean | 2.98 |
| std | 1.12 |
| min | 1.00 |
| 25% | 2.00 |
| 50% | 3.00 |
| 75% | 4.00 |
| 95% | 5.00 |
| 99% | 6.00 |
| max | 15.00 |
# Plot: histogram of distinct consulting editors per manuscript.
# BUG FIX: this previously histogrammed nunique of 'Comment date' (i.e. N
# comments per MS) although the axis label and the tabulation directly above
# refer to consulting editors; switched to 'Commenter ID' to match both.
title_color, tick_color, edge_color, data_color = 'black', 'darkgray', 'dimgray', 'gainsboro'
font_size = 25
font_weight = "bold"
fig, ax = plt.subplots()
plt.style.use('seaborn-white')
plt.setp(ax.spines.values(), color=data_color)
plt.hist(init_consults.groupby("Manuscript no.")['Commenter ID'].nunique(),
         bins=15,
         range=[0,15],
         edgecolor=edge_color,
         facecolor=data_color,
         linewidth=1.5)
plt.xlabel('N Consulting Editors per MS',
           fontsize=font_size,
           labelpad=25,
           color=title_color,
           weight=font_weight)
plt.ylabel('N\nManuscripts',
           fontsize=font_size,
           rotation=0,
           labelpad=90,
           color=title_color,
           weight=font_weight)
plt.grid(b=True,
         color=tick_color,
         axis="y",
         alpha=0.8,
         linestyle=':',
         linewidth=1)
plt.xticks(fontsize=25,
           ticks=list(range(0,15,3)),
           color=tick_color,
           weight=font_weight)
plt.yticks(fontsize=25,
           color=tick_color,
           weight=font_weight)
plt.show()
... Which would imply no consultation. So these might need to be explored further.
# Count manuscripts whose initial consultation involved exactly one editor
# (i.e. no real back-and-forth; flagged for further exploration above).
n_editors = pd.DataFrame(init_consults.groupby("Manuscript no.")['Commenter ID'].nunique())
n_editors[n_editors["Commenter ID"] == 1].sum() # note the column name is inherited; it is actually n editors
Commenter ID 1318 dtype: int64
The reviewer consultation data comes in 10 files, 2012–2022, post-review. See the merge_consultations notebook for how I cleaned and merged these files. There are ~144K consultation comments with 5 features.
# Locate and load the merged reviewer-consultation CSV, then list its columns.
rev_consults_file = glob.glob(CD+"*Reviewer*ALL.csv")[0]
rev_consults = pd.read_csv(rev_consults_file)
for v in rev_consults.columns:
    print(v)
Manuscript no. Comment date Comment text Commenter Commenter ID
# Earliest reviewer-consultation comment date (clean here, unlike initial consults).
min(rev_consults['Comment date'])
'2012-06-21 15:20:30.143'
# (rows, columns)
rev_consults.shape
(144889, 5)
This number accords with the number of manuscripts sent out for review.
# Distinct manuscripts with at least one reviewer-consultation comment.
print("{} post-review manuscripts have consultation comments".format(rev_consults['Manuscript no.'].nunique()))
18546 post-review manuscripts have consultation comments
def get_comment_lens(s):
    """Count whitespace-separated tokens in a comment string."""
    tokens = s.split()
    return len(tokens)
# Token length of every reviewer-consultation comment.
rev_consults["lengths"] = rev_consults['Comment text'].apply(get_comment_lens)
print("-"*40)
print("Reviewer Consultation Comment Lengths")
print("-"*40)
# Summary statistics of comment lengths, rounded to 2 decimals.
pd.DataFrame(round(rev_consults['lengths'].describe(), 2))
---------------------------------------- Reviewer Consultation Comment Lengths ----------------------------------------
| lengths | |
|---|---|
| count | 144889.00 |
| mean | 119.97 |
| std | 252.32 |
| min | 1.00 |
| 25% | 25.00 |
| 50% | 63.00 |
| 75% | 120.00 |
| max | 13052.00 |
# Plot: histogram of tokens per reviewer-consultation comment (x truncated at 1,000).
title_color, tick_color, edge_color, data_color = 'black', 'darkgray', 'dimgray', 'gainsboro'
font_size = 25
font_weight = "bold"
fig, ax = plt.subplots()
plt.style.use('seaborn-white')
plt.setp(ax.spines.values(), color=data_color)
plt.hist(rev_consults['lengths'],
         bins=100,
         range=[0,1000],
         edgecolor=edge_color,
         facecolor=data_color,
         linewidth=1.5)
plt.xlabel('N Tokens per Consultation Comment',
           fontsize=font_size,
           labelpad=25,
           color=title_color,
           weight=font_weight)
plt.ylabel('N Reviewer\nConsultation\nComments',
           fontsize=font_size,
           rotation=0,
           labelpad=105,
           color=title_color,
           weight=font_weight)
plt.grid(b=True,
         color=tick_color,
         axis="y",
         alpha=0.8,
         linestyle=':',
         linewidth=1)
plt.xticks(fontsize=25,
           color=tick_color,
           weight=font_weight,
           rotation = 60)
plt.yticks(fontsize=25,
           color=tick_color,
           weight=font_weight)
plt.show()
# Tabulate: distribution of distinct comment dates (≈ N comments) per manuscript.
print("-"*33)
print("N CONSULTATION COMMENTS PER MS:")
print("-"*33)
pd.DataFrame(round(rev_consults.groupby("Manuscript no.")['Comment date'].nunique().describe(percentiles=[.25,.5, .75, .95, .99]), 2))
--------------------------------- N CONSULTATION COMMENTS PER MS: ---------------------------------
| Comment date | |
|---|---|
| count | 18546.00 |
| mean | 7.81 |
| std | 4.64 |
| min | 1.00 |
| 25% | 5.00 |
| 50% | 7.00 |
| 75% | 10.00 |
| 95% | 17.00 |
| 99% | 23.00 |
| max | 52.00 |
# Plot: histogram of N reviewer-consultation comments per manuscript.
title_color, tick_color, edge_color, data_color = 'black', 'darkgray', 'dimgray', 'gainsboro'
font_size = 25
font_weight = "bold"
fig, ax = plt.subplots()
plt.style.use('seaborn-white')
plt.setp(ax.spines.values(), color=data_color)
plt.hist(rev_consults.groupby("Manuscript no.")['Comment date'].nunique(),
         bins=20,
         edgecolor=edge_color,
         facecolor=data_color,
         linewidth=1.5)
plt.xlabel('N Reviewer Consultation\nComments per MS',
           fontsize=font_size,
           labelpad=25,
           color=title_color,
           weight=font_weight)
plt.ylabel('N\nManuscripts',
           fontsize=font_size,
           rotation=0,
           labelpad=90,
           color=title_color,
           weight=font_weight)
plt.grid(b=True,
         color=tick_color,
         axis="y",
         alpha=0.8,
         linestyle=':',
         linewidth=1)
plt.xticks(fontsize=25,
           color=tick_color,
           weight=font_weight)
plt.yticks(fontsize=25,
           color=tick_color,
           weight=font_weight)
plt.show()
... This is one less than the mean, because the editors are counted in these numbers, too.
# Tabulate: distribution of distinct commenters (editors + reviewers) per manuscript.
print("-"*33)
print("N CONSULTING EDITORS PER MS:")
print("-"*33)
pd.DataFrame(round(rev_consults.groupby("Manuscript no.")['Commenter ID'].nunique().describe(percentiles=[.25,.5, .75, .95, .99]), 2))
--------------------------------- N CONSULTING EDITORS PER MS: ---------------------------------
| Commenter ID | |
|---|---|
| count | 18546.00 |
| mean | 3.51 |
| std | 1.06 |
| min | 1.00 |
| 25% | 3.00 |
| 50% | 4.00 |
| 75% | 4.00 |
| 95% | 5.00 |
| 99% | 6.00 |
| max | 9.00 |
# Plot: histogram of distinct consulting participants per manuscript.
# BUG FIX: this previously histogrammed nunique of 'Comment date' (N comments
# per MS) although the axis label says consulting reviewers and the matching
# tabulation above uses 'Commenter ID'; switched to 'Commenter ID'.
title_color, tick_color, edge_color, data_color = 'black', 'darkgray', 'dimgray', 'gainsboro'
font_size = 25
font_weight = "bold"
fig, ax = plt.subplots()
plt.style.use('seaborn-white')
plt.setp(ax.spines.values(), color=data_color)
plt.hist(rev_consults.groupby("Manuscript no.")['Commenter ID'].nunique(),
         bins=15,
         range=[0,15],
         edgecolor=edge_color,
         facecolor=data_color,
         linewidth=1.5)
plt.xlabel('N Consulting Reviewers per MS',
           fontsize=font_size,
           labelpad=25,
           color=title_color,
           weight=font_weight)
plt.ylabel('N\nManuscripts',
           fontsize=font_size,
           rotation=0,
           labelpad=90,
           color=title_color,
           weight=font_weight)
plt.grid(b=True,
         color=tick_color,
         axis="y",
         alpha=0.8,
         linestyle=':',
         linewidth=1)
plt.xticks(fontsize=25,
           ticks=list(range(0,15,3)),
           color=tick_color,
           weight=font_weight)
plt.yticks(fontsize=25,
           color=tick_color,
           weight=font_weight)
plt.show()
The eLife review data contains only information about the reviews themselves. There are ~50K reviews (with manuscripts often receiving more than 1 round of ≥1 review) and 7 variables.
# Load the RoBERTa-classified eLife reviews (see classify_eLife notebook).
reviews = pd.read_csv(RD+'classified_elife_reviews.csv')
# (rows, columns)
reviews.shape
(43372, 7)
Here are the columns in the data. The ratings are predicted using RoBERTa. See the "classify_eLife" notebook for those procedures.
# List the review-data columns; rating_hat is the model-predicted rating.
for v in reviews.columns:
    print(v)
Manuscript no. Reviewer ID Major comments Minor comments Data comments Competing interests rating_hat
# Tabulate: reviews per manuscript (count of reviewer-id rows).
print("-"*33)
print("N REVIEWS PER MS:")
print("-"*33)
print(round(reviews.groupby("Manuscript no.")['Reviewer ID'].count().describe(percentiles=[.5, .75, .95, .99]), 2))
print("-"*33)
--------------------------------- N REVIEWS PER MS: --------------------------------- count 15585.00 mean 2.78 std 0.54 min 1.00 50% 3.00 75% 3.00 95% 3.00 99% 4.00 max 19.00 Name: Reviewer ID, dtype: float64 ---------------------------------
# Plot: histogram of N reviews per manuscript (1-4; bin edges at half-integers
# center each bar on its integer count).
title_color, tick_color, edge_color, data_color = 'black', 'darkgray', 'dimgray', 'gainsboro'
font_size = 25
font_weight = "bold"
fig, ax = plt.subplots()
plt.style.use('seaborn-white')
plt.setp(ax.spines.values(), color=data_color)
plt.hist(reviews.groupby("Manuscript no.")['Reviewer ID'].count(),
         bins=4,
         range=[0.5,4.5],
         edgecolor=edge_color,
         facecolor=data_color,
         linewidth=1.5)
plt.xlabel('N Reviews per MS',
           fontsize=font_size,
           labelpad=25,
           color=title_color,
           weight=font_weight)
plt.ylabel('N\nManuscripts',
           fontsize=font_size,
           rotation=0,
           labelpad=90,
           color=title_color,
           weight=font_weight)
plt.grid(b=True,
         color=tick_color,
         axis="y",
         alpha=0.8,
         linestyle=':',
         linewidth=1)
plt.xticks(ticks=[1,2,3,4],
           fontsize=25,
           color=tick_color,
           weight=font_weight)
plt.yticks(fontsize=25,
           color=tick_color,
           weight=font_weight)
plt.show()
# Tabulate: how many reviews each reviewer wrote over the whole period.
print("-"*37)
print("N REVIEWS PER REVIEWER:")
print("-"*37)
print(round(reviews.groupby("Reviewer ID").count()['Manuscript no.'].describe()))
print("-"*37)
------------------------------------- N REVIEWS PER REVIEWER: ------------------------------------- count 19223.0 mean 2.0 std 4.0 min 1.0 25% 1.0 50% 1.0 75% 2.0 max 112.0 Name: Manuscript no., dtype: float64 -------------------------------------
# Plot: histogram of N reviews per reviewer (x truncated at 80).
# Plot globals
title_color, tick_color, edge_color, data_color = 'black', 'darkgray', 'dimgray', 'gainsboro'
font_size = 25
font_weight = "bold"
# Plot data
fig, ax = plt.subplots()
plt.style.use('seaborn-white')
plt.setp(ax.spines.values(), color=data_color)
plt.hist(reviews.groupby("Reviewer ID").count()['Manuscript no.'],
         bins=80,
         range=[0,80],
         edgecolor=edge_color,
         facecolor=data_color,
         linewidth=1.5)
# Style
plt.xlabel('N Reviews per Reviewer',
           fontsize=font_size,
           labelpad=25,
           color=title_color,
           weight=font_weight)
plt.ylabel('N\nReviewers',
           fontsize=font_size,
           rotation=0,
           labelpad=90,
           color=title_color,
           weight=font_weight)
plt.grid(b=True,
         color=tick_color,
         axis="y",
         alpha=0.8,
         linestyle=':',
         linewidth=1)
plt.xticks(fontsize=25,
           color=tick_color,
           weight=font_weight,
           rotation=60)
plt.yticks(fontsize=25,
           color=tick_color,
           weight=font_weight)
plt.show()
print("-"*40)
print("REVIEW LENGTH (n tokens):")
print("-"*40)
# NOTE: third redefinition of this helper in the notebook; identical behavior.
def get_comment_lens(s):
    """Return the number of whitespace-delimited tokens in s."""
    s = s.split()
    return (len(s))
# Token count of each review's "Major comments" section only.
reviews['lengths'] = reviews['Major comments'].apply(get_comment_lens)
print(round(reviews['lengths'].describe(), 2))
print("-"*40)
---------------------------------------- REVIEW LENGTH (n tokens): ---------------------------------------- count 43372.00 mean 491.59 std 303.12 min 1.00 25% 292.00 50% 441.00 75% 617.25 max 6695.00 Name: lengths, dtype: float64 ----------------------------------------
# Plot: histogram of tokens per review (full range, 100 bins).
title_color, tick_color, edge_color, data_color = 'black', 'darkgray', 'dimgray', 'gainsboro'
font_size = 25
font_weight = "bold"
fig, ax = plt.subplots()
plt.style.use('seaborn-white')
plt.setp(ax.spines.values(), color=data_color)
plt.hist(reviews['lengths'],
         bins=100,
         edgecolor=edge_color,
         facecolor=data_color,
         linewidth=1.5)
plt.xlabel('N Tokens per Review',
           fontsize=font_size,
           labelpad=25,
           color=title_color,
           weight=font_weight)
plt.ylabel('N\nReviews',
           fontsize=font_size,
           rotation=0,
           labelpad=90,
           color=title_color,
           weight=font_weight)
plt.grid(b=True,
         color=tick_color,
         axis="y",
         alpha=0.8,
         linestyle=':',
         linewidth=1)
plt.xticks(fontsize=25,
           color=tick_color,
           weight=font_weight,
           rotation = 60)
plt.yticks(fontsize=25,
           color=tick_color,
           weight=font_weight)
plt.show()
Zoomed in at the beginning of the range:
# Plot: same histogram zoomed to very short reviews (0-10 tokens).
title_color, tick_color, edge_color, data_color = 'black', 'darkgray', 'dimgray', 'gainsboro'
font_size = 25
font_weight = "bold"
fig, ax = plt.subplots()
plt.style.use('seaborn-white')
plt.setp(ax.spines.values(), color=data_color)
plt.hist(reviews['lengths'],
         bins=10,
         range=[0,10],
         edgecolor=edge_color,
         facecolor=data_color,
         linewidth=1.5)
plt.xlabel('N Tokens per Review',
           fontsize=font_size,
           labelpad=25,
           color=title_color,
           weight=font_weight)
plt.ylabel('N\nReviews',
           fontsize=font_size,
           rotation=0,
           labelpad=90,
           color=title_color,
           weight=font_weight)
plt.grid(b=True,
         color=tick_color,
         axis="y",
         alpha=0.8,
         linestyle=':',
         linewidth=1)
plt.xticks(fontsize=25,
           ticks=list(range(0,10,1)),
           color=tick_color,
           weight=font_weight,
           rotation = 60)
plt.yticks(fontsize=25,
           color=tick_color,
           weight=font_weight)
plt.show()
I pretrained RoBERTa on ~28K ICLR reviews. Their ratings were 0–5, in the same ordinal direction (5 = outstanding) as the labels I used for eLife. Then, I trained on my eLife labels, which were 1–4. Finally, I predicted all of eLife's reviews. The final model produced labels for eLife that were just below and just above my hand-label range. I thought about rescaling ICLR, but I didn't want to lose any nuanced meaning in that data. I also didn't think the difference in the two scales' magnitudes would be all that important; the theory was that the relationship between the texts and the rating scales was consistent and generalizable to both.
# Standardize predicted recommendation scores (z-score).
# NOTE: np.std is the population SD (ddof=0), unlike pandas' .std() (ddof=1).
reviews['std_rating_hat'] = (reviews['rating_hat'] - np.mean(reviews['rating_hat'])) / np.std(reviews['rating_hat'])
print("-"*40)
print("PREDICTED REVIEW RATING")
print("-"*40)
# Summary of the raw (unstandardized) predicted ratings.
print(round(reviews['rating_hat'].describe(), 2))
print("-"*40)
---------------------------------------- PREDICTED REVIEW RATING ---------------------------------------- count 43372.00 mean 2.66 std 0.69 min 0.63 25% 2.20 50% 2.73 75% 3.13 max 4.56 Name: rating_hat, dtype: float64 ----------------------------------------
# Plot: histogram of predicted reviewer ratings on the 0-5 scale.
title_color, tick_color, edge_color, data_color = 'black', 'darkgray', 'dimgray', 'gainsboro'
font_size = 25
font_weight = "bold"
fig, ax = plt.subplots()
plt.style.use('seaborn-white')
plt.setp(ax.spines.values(), color=data_color)
plt.hist(reviews['rating_hat'],
         bins=25,
         range=[0,5],
         edgecolor=edge_color,
         facecolor=data_color,
         linewidth=1.5)
plt.xlabel('Predicted Reviewer Ratings',
           fontsize=font_size,
           labelpad=25,
           color=title_color,
           weight=font_weight)
plt.ylabel('N\nReviews',
           fontsize=font_size,
           rotation=0,
           labelpad=90,
           color=title_color,
           weight=font_weight)
plt.grid(b=True,
         color=tick_color,
         axis="y",
         alpha=0.8,
         linestyle=':',
         linewidth=1)
plt.xticks(fontsize=25,
           color=tick_color,
           weight=font_weight)
plt.yticks(fontsize=25,
           color=tick_color,
           weight=font_weight)
plt.show()
The manuscript data contains all manuscripts, reviewed and desk rejected. There are about 61 K manuscripts with 25 features.
# Load the manuscript history file (all submissions, incl. desk rejects).
manuscript_file = glob.glob(ELIFE+"*history*")[0]
manuscripts = pd.read_csv(manuscript_file)
# (rows, columns)
manuscripts.shape
(60899, 25)
Here are the variables in the manuscript data. *_dt corresponds to "date".
# List the manuscript-data columns; *_dt suffix marks date fields.
for v in manuscripts.columns:
    print(v)
ms type country senior_editor initial_qc_dt initial_decision initial_decision_dt reviewing_editor full_qc_dt full_decision full_decision_dt rev1_qc_dt rev1_decision rev1_decision_dt rev2_qc_dt rev2_decision rev2_decision_dt rev3_qc_dt rev3_decision rev3_decision_dt rev4_qc_dt rev4_decision rev4_decision_dt p.poa_dt p.vor_dt
The dataframe contains some entirely empty (fake) rows:
# Count and inspect rows with a missing manuscript id (fully empty rows).
print(manuscripts['ms'].isna().sum())
manuscripts[manuscripts['ms'].isna()]
688
| ms | type | country | senior_editor | initial_qc_dt | initial_decision | initial_decision_dt | reviewing_editor | full_qc_dt | full_decision | ... | rev2_decision | rev2_decision_dt | rev3_qc_dt | rev3_decision | rev3_decision_dt | rev4_qc_dt | rev4_decision | rev4_decision_dt | p.poa_dt | p.vor_dt | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 60211 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 60212 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 60213 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 60214 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 60215 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 60894 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 60895 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 60896 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 60897 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 60898 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
688 rows × 25 columns
# Drop the empty rows, i.e. those with no manuscript id at all.
manuscripts = manuscripts.dropna(how="any", subset=['ms'])
print("-"*41)
print("{} manuscripts were submitted to eLife".format(manuscripts.shape[0]))
print("-"*41)
----------------------------------------- 60211 manuscripts were submitted to eLife -----------------------------------------
# Tabulate desk-stage (initial) decisions; the total is below the number of
# manuscripts because some rows have no initial decision recorded.
print("-"*39)
print("Manuscripts at Initial / Desk Decision:")
print("-"*39)
print(manuscripts['initial_decision'].value_counts())
print()
print("Total manuscripts with `initial_decision`: {}".format(manuscripts['initial_decision'].value_counts().sum()))
print("-"*48)
--------------------------------------- Manuscripts at Initial / Desk Decision: --------------------------------------- Reject Initial Submission 40961 Encourage Full Submission 18010 Simple Withdraw 197 Reject Full Submission 1 Name: initial_decision, dtype: int64 Total manuscripts with `initial_decision`: 59169 ------------------------------------------------
# Tabulate first post-review ("full") decisions.
print("-"*44)
print("Manuscripts with Full Decision after review:")
print("-"*44)
print(manuscripts['full_decision'].value_counts())
print()
print("Total manuscripts with `full_decision`: {}".format(manuscripts['full_decision'].value_counts().sum()))
print("-"*45)
-------------------------------------------- Manuscripts with Full Decision after review: -------------------------------------------- Revise Full Submission 10728 Reject Full Submission 8068 Accept Full Submission 237 Simple Withdraw 78 Name: full_decision, dtype: int64 Total manuscripts with `full_decision`: 19111 ---------------------------------------------
# Tabulate decision counts for each of the four revision rounds and
# accumulate the total number of round-level decisions.
total_manuscripts_with_reviews = 0
for round_no in range(1,5):
    decisions = manuscripts['rev{}_decision'.format(round_no)].value_counts()
    rule = "-"*43
    print(rule)
    print("Manuscripts with Decision after Round # {}:".format(round_no))
    print(rule)
    print(decisions)
    print(rule, '\n\n')
    total_manuscripts_with_reviews += decisions.sum()
print("-"*55)
print("Total number of manuscripts with `revX_decision`: {}".format(total_manuscripts_with_reviews))
print("-"*55)
------------------------------------------- Manuscripts with Decision after Round # 1: ------------------------------------------- Accept Full Submission 7130 Revise Full Submission 2904 Reject Full Submission 1104 Simple Withdraw 30 Name: rev1_decision, dtype: int64 ------------------------------------------- ------------------------------------------- Manuscripts with Decision after Round # 2: ------------------------------------------- Accept Full Submission 2445 Revise Full Submission 353 Reject Full Submission 136 Simple Withdraw 3 Name: rev2_decision, dtype: int64 ------------------------------------------- ------------------------------------------- Manuscripts with Decision after Round # 3: ------------------------------------------- Accept Full Submission 312 Revise Full Submission 39 Reject Full Submission 19 Simple Withdraw 1 Name: rev3_decision, dtype: int64 ------------------------------------------- ------------------------------------------- Manuscripts with Decision after Round # 4: ------------------------------------------- Accept Full Submission 34 Reject Full Submission 2 Revise Full Submission 2 Name: rev4_decision, dtype: int64 ------------------------------------------- ------------------------------------------------------- Total number of manuscripts with `revX_decision`: 14514 -------------------------------------------------------
reviewed_ms = manuscripts.dropna(how="all", subset=["full_decision", 'rev1_decision', 'rev2_decision','rev3_decision', 'rev4_decision'])
# Rename ms
reviewed_ms.ms = reviewed_ms.ms.astype(int)
reviewed_ms["Manuscript no."] = reviewed_ms['ms']
reviewed_ms.shape
(19126, 26)
# Across the decision columns, grab the last non-missing one and post it as the final:
reviewed_ms["final_decision"] = reviewed_ms.apply(lambda x: list(x[["full_decision", 'rev1_decision',
'rev2_decision','rev3_decision',
'rev4_decision']].dropna())[-1], axis=1)
# Dichotomize the final decision, 1=accept
reviewed_ms["final_accept"] = np.where(reviewed_ms["final_decision"] == "Accept Full Submission", 1, 0)
print("-"*35)
print("FINAL ACCEPTANCES (=1)")
print("Total Manuscripts Reviewed = {}".format(reviewed_ms.shape[0]))
print("-"*35)
print(round(reviewed_ms["final_accept"].value_counts(normalize=True),2))
print("-"*35)
----------------------------------- FINAL ACCEPTANCES (=1) Total Manuscripts Reviewed = 19126 ----------------------------------- 1 0.53 0 0.47 Name: final_accept, dtype: float64 -----------------------------------
The reviewer data contains all demographic info on the reviewers. There are about 53K reviewers and 8 columns. If eLife editors are acting as reviewers (and not managing the review of an MS), they are indistinguishable as editors from reviewers using Reviewer ID. They can be differentiated by name and institution, though.
# Load reviewer demographics; the real header sits on the file's third line.
reviewers = pd.read_csv(ELIFE+"eLife_Reviewers.csv", header=2)
# (rows, columns)
reviewers.shape
(53688, 8)
Here are the features we have on reviewers.
# List the reviewer-data columns.
for v in reviewers.columns:
    print(v)
Manuscript no. Reviewer no. Reviewer ID Reviewer name Reviewer email Reviewer institution First subject area Second subject area
The reviewer data doesn't come with reviewers' countries. To infer where they are based, we take their email TLDs and reverse-look up the country.
def get_domain(s):
    """
    Return the lower-cased top-level domain of an email address:
    everything after the final "." (the whole string if no dot).
    """
    return str(s).rsplit(".", 1)[-1].lower()
Import the country using a {country -> TLD} mapping downloaded from GitHub.
# Download the country/TLD table and build a {tld -> country} dict.
tld_dct = pd.read_csv("https://gist.githubusercontent.com/derlin/421d2bb55018a1538271227ff6b1299d/raw/3a131d47ca322a1d001f1f79333d924672194f36/country-codes-tlds.csv")
# NOTE(review): x[-3:] assumes every tld entry is a dot plus two letters
# (e.g. " .uk") — confirm against the gist's actual format.
tld_dct[' tld'] = tld_dct[' tld'].apply(lambda x: x[-3:])
tld_dct[' tld'] = tld_dct[' tld'].apply(lambda x: x.strip("."))
tld_dct = dict(zip(tld_dct[' tld'], tld_dct['country']))
# Generic US-centric TLDs not covered by the country table.
tld_dct['edu'] = "United States"
tld_dct['gov'] = "United States"
def get_country(s, tld_dct):
    """
    Map a TLD to its country name via *tld_dct*; "unknown" when absent.
    Any parenthesised annotation in the mapped name is dropped.
    """
    try:
        raw = tld_dct[s]
    except KeyError:
        return "unknown"
    return raw.split("(")[0].strip()
def get_anglo(s):
    """
    Return 1 when *s* is one of the listed anglophone/Commonwealth
    countries, else 0.
    """
    anglo = {'Australia', 'Canada', 'India',
             'New Zealand', 'Pakistan', 'South Africa',
             'Sri Lanka', 'United Kingdom', 'United States'}
    return 1 if s in anglo else 0
Grab domain, infer country, determine if in commonwealth
# Grab each reviewer's email domain, infer country, flag anglophone countries.
reviewers["r_domain"] = reviewers['Reviewer email'].apply(get_domain)
reviewers['r_country'] = reviewers['r_domain'].apply(get_country, args=(tld_dct,))
reviewers["r_commonwealth"] = reviewers['r_country'].apply(get_anglo)
Author data contains all demographic information on all submitting authors, including co-authors. In the analyses below, I summarize all authors and then just the "corresponding authors" — these are not necessarily the first author on the paper.
'''
The author csv file has some sloppy lines, particularly when
institution and department names span multiple lines and/or
contain double quotation marks. This loop cleans them up.
'''
cleaned_rows = []
with open(ELIFE+"eLife_Authors.csv", 'r') as f:
    reader = csv.reader(f)
    for row in reader:
        # There should be 11 fields; if more, the department/institution
        # text was spliced across extra columns. Keep the stable leading
        # and trailing fields and blank out department/institution.
        if len(row) > 11:
            # FIX: was eval(row[0]) — unsafe on file input and produced an
            # int while well-formed rows keep the csv string.
            ms_no = row[0]
            auth_no = row[1]
            auth_id = row[2]
            auth_type = row[3]
            auth_dual = row[4]
            auth_name = row[5]
            dept = ""
            inst = ""
            city = row[-3]
            country = row[-2]
            email = row[-1]
            # FIX: auth_id was previously omitted from the appended row,
            # yielding 10-field rows against the 11-column header.
            cleaned_rows.append([ms_no, auth_no, auth_id, auth_type,
                                 auth_dual, auth_name,
                                 dept, inst,  # empty
                                 city, country, email])
        else:
            cleaned_rows.append(row)
# The header row sits at index 3; data rows start at index 5.
authors = pd.DataFrame(cleaned_rows[5:], columns=cleaned_rows[3])
authors.shape
Here are the features we have on authors. Note that country is input by authors themselves and varies dramatically (i.e. US, USA, the US, United States). This gets a bit tricky down the line.
# List the author-data columns.
for v in authors.columns:
    print(v)
Manuscript no. Author number Author ID Author type Dual corresponding author? Author name Department Institution City Country Author email
# Flag authors based in anglophone/Commonwealth countries (self-reported Country).
authors["a_commonwealth"] = authors["Country"].apply(get_anglo)
Here I look at country of origin (i.e. where authors write from) and conventional gender identity of the name authors write under.
# Attach the unique N of countries represented by the authors on each MS
# (broadcast back to every author row via the join on "Manuscript no.").
authors = authors.join(authors.groupby("Manuscript no.")["Country"].nunique(), on="Manuscript no.", rsuffix="_n")
# Attach the unique N of authors on each MS.
authors = authors.join(authors.groupby("Manuscript no.")["Author ID"].nunique(), on="Manuscript no.", rsuffix="_n")
Country diversity — I compute country_diversity as the ratio of unique author countries over the total number of authors. So, if there are 4 authors and 3 come from different countries, diversity = 3/4. For papers whose authors all come from a single country, the numerator is 1, so I hard-code these as 0 (i.e. no diversity).
# Compute `country_diversity` = unique author countries / unique authors.
authors['country_diversity'] = round(authors['Country_n'] / authors['Author ID_n'],3)
# Papers whose authors are all from one country get 0 (no diversity).
authors['country_diversity'] = np.where(authors['Country_n']==1, 0, authors['country_diversity'])
def get_region(s):
    """
    Takes a country name, looks up its world region via the restcountries
    API, and returns it ("Americas", "Asia", etc.), or "unknown" when the
    lookup fails.
    """
    s = str(s)
    s = s.strip()
    s = s.split()
    # Generic words that break simple name look-ups; dropping them lets
    # the API resolve the intended country.
    fillers = ['United', "Republic", "of", "State", "the", "Arab",
               "Democratic", "South", "French", "El", "Islamic"]
    if len(s) > 1:
        # BUG FIX: the original tested `("United" and "Kingdom") in s`,
        # which evaluates to just `"Kingdom" in s` (and likewise for
        # "States"); both words must be checked explicitly.
        if "United" in s and "Kingdom" in s:
            country = "britain"
        elif "United" in s and "States" in s:
            country = "usa"
        else:
            s = [word for word in s if word not in fillers]
            country = "?".join(s)
    elif len(s) == 1:
        country = s[0]
    else:
        country = -99
    r = requests.get('https://restcountries.com/v3.1/name/{}?fields=region'.format(country))
    if r.status_code == 200:
        # The API mis-buckets these two; override manually.
        if country == "Iran":
            return "Asia"
        elif country == "Romania":
            return "Europe"
        else:
            return r.json()[0]['region']
    else:
        return "unknown"
So we don't inefficiently look up a country repeatedly (which is slow with requests), I create a dict with each unique country as the key and its world region as the value. Then I use this dictionary to populate the region field on the original dataframe.
# Create a dict of unique countries (authors + reviewers) and look each
# region up exactly once — avoids one slow API call per row.
countries = authors['Country'].tolist()
countries.extend(reviewers['r_country'].tolist())
countries = set(countries)
countries = dict.fromkeys(countries, None)
for country in countries:
    countries[country] = get_region(country)
Here we construct indicators of the world region and whether contributor is based in Americas/Asia.
# World-region label plus Asia/Americas indicator flags for authors and reviewers.
authors['a_region'] = authors['Country'].apply(lambda x: countries[x])
authors['a_asia'] = np.where((authors['a_region'] == "Asia"), 1, 0)
authors['a_america'] = np.where((authors['a_region'] == "Americas"), 1, 0)
reviewers['r_region'] = reviewers['r_country'].apply(lambda x: countries[x])
reviewers['r_asia'] = np.where((reviewers['r_region'] == "Asia"), 1, 0)
reviewers['r_america'] = np.where((reviewers['r_region'] == "Americas"), 1, 0)
With the first names, we use genderize.io to classify the names' conventional genders.
def get_first(name):
    """
    Takes a full name;
    Returns the first name ("" when the input is blank).
    """
    tokens = str(name).strip().split()
    if not tokens:
        return ""
    first = tokens[0]
    # Joint names like "Anna&Ben" keep only the part before the "&".
    if "&" in first:
        first = first.split("&")[0]
    return first
# Collect unique first names across authors and reviewers, stored as
# dict keys (values are filled in by the gender classifier later).
names = {}
for full_name in list(authors["Author name"]) + list(reviewers['Reviewer name']):
    names.setdefault(get_first(full_name), None)
Un-comment to re-do gender classification (takes over an hour):
# genderize = Genderize(
#     api_key=GENDERIZE_API_KEY)  # NOTE(review): a real API key was committed here — revoke it and load the key from an env var or config file instead
# with open(DD+"name_genders.csv", "w") as f:
# writer = csv.writer(f)
# writer.writerow(["name", "gender", "probability"])
# for name in names.keys():
# sex = genderize.get([name])[0]['gender']
# prob = genderize.get([name])[0]['probability']
# if sex == "female":
# gender = "woman"
# else:
# gender = "man"
# writer.writerow([name, gender, prob])
Here, I compute a gender spectrum based on the first name.
# Read back the saved name -> gender classifications.
gender_names = pd.read_csv(RD + "name_genders.csv")
# probability == 0 means the classifier had no data for the name.
gender_names['gender'] = np.where(gender_names['probability'] == 0,
                                  "unknown", gender_names['gender'])
# prob_woman is the complement of the probability for names classed "man".
gender_names['prob_woman'] = np.where(gender_names['gender'] == "man",
                                      1 - gender_names['probability'],
                                      gender_names['probability'])
gender_names.head(-10)
| name | gender | probability | prob_woman | |
|---|---|---|---|---|
| 0 | Hugo | man | 0.99 | 0.01 |
| 1 | Ian | man | 0.99 | 0.01 |
| 2 | Preetha | woman | 0.96 | 0.96 |
| 3 | Silvia | woman | 0.99 | 0.99 |
| 4 | Zhihuan | unknown | 0.00 | 0.00 |
| ... | ... | ... | ... | ... |
| 25187 | Naduparambil | unknown | 0.00 | 0.00 |
| 25188 | Sheree | woman | 0.96 | 0.96 |
| 25189 | Gussie | woman | 1.00 | 1.00 |
| 25190 | Jouko | man | 1.00 | 0.00 |
| 25191 | Zuyun | man | 1.00 | 0.00 |
25192 rows × 4 columns
# Attach contributors' first names, then merge the gender data on "name".
authors['name'] = authors['Author name'].apply(get_first)
reviewers['name'] = reviewers['Reviewer name'].apply(get_first)
authors = authors.merge(gender_names, how="left")
reviewers = reviewers.merge(gender_names, how="left")
# One row per unique author for person-level statistics.
agg_authors = authors.drop_duplicates(subset=["Author ID"])
rule = "-" * 22
print(rule)
print("AUTHORS PER COUNTRY")
print("Total Authors = {}".format(agg_authors.shape[0]))
print(rule)
pd.DataFrame(round(agg_authors['Country'].value_counts(normalize=True)[:10], 2))
---------------------- AUTHORS PER COUNTRY Total Authors = 108276 ----------------------
| Country | |
|---|---|
| United States | 0.40 |
| United Kingdom | 0.10 |
| Germany | 0.08 |
| China | 0.06 |
| France | 0.05 |
| Japan | 0.03 |
| Canada | 0.03 |
| Australia | 0.02 |
| Switzerland | 0.02 |
| Netherlands | 0.02 |
# Share of unique authors by world region.
rule = "-" * 22
print(rule)
print("AUTHORS PER REGION")
print("Total Authors = {}".format(agg_authors.shape[0]))
print(rule)
pd.DataFrame(round(agg_authors['a_region'].value_counts(normalize=True)[:10], 2))
---------------------- AUTHORS PER REGION Total Authors = 108276 ----------------------
| a_region | |
|---|---|
| Americas | 0.44 |
| Europe | 0.36 |
| Asia | 0.15 |
| Oceania | 0.02 |
| unknown | 0.02 |
| Africa | 0.01 |
| Antarctic | 0.00 |
# Share of unique authors in Commonwealth countries (1) vs not (0).
rule = "-" * 28
print(rule)
print("AUTHORS IN COMMONWEALTH (=1)")
print("Total Authors = {}".format(agg_authors.shape[0]))
print(rule)
pd.DataFrame(round(agg_authors['a_commonwealth'].value_counts(normalize=True)[:10], 2))
---------------------------- AUTHORS IN COMMONWEALTH (=1) Total Authors = 108276 ----------------------------
| a_commonwealth | |
|---|---|
| 1 | 0.56 |
| 0 | 0.44 |
# Gender split among unique authors.
rule = "-" * 22
print(rule)
print("AUTHOR GENDERS")
print("Total Authors = {}".format(agg_authors.shape[0]))
print(rule)
pd.DataFrame(round(agg_authors['gender'].value_counts(normalize=True)[:10], 2))
---------------------- AUTHOR GENDERS Total Authors = 108276 ----------------------
| gender | |
|---|---|
| man | 0.61 |
| woman | 0.36 |
| unknown | 0.03 |
def get_ISO(s):
    """
    Takes a string of author-input country;
    Tries to return its ISO alpha-3 code.

    Special-cases names that pycountry's fuzzy search misresolves.
    Returns None for non-string input, and prints a notice (returning
    None) when no match can be found.
    """
    if not isinstance(s, str):
        return None
    # Strip any bracketed annotation appended to the country name.
    s = s.split("[")[0]
    # (substring to detect, fuzzy query, index of the correct hit).
    # Index 1 selects South Korea from the two "korea" results.
    special_cases = (
        ("Korea", "korea", 1),
        ("Taiwan", "taiwan", 0),
        ("Serbia", "serbia", 0),
        ("Burma", "myanmar", 0),
        ("Laos", "lao", 0),
        ("d'Ivoire", "d'ivoire", 0),
        ("Netherlands", "Netherlands", 0),
        ("China", "china", 0),
    )
    for marker, query, idx in special_cases:
        if marker in s:
            return pycountry.countries.search_fuzzy(query)[idx].alpha_3
    # BUG FIX: the original tested `"Democratic" and "Congo" in s`,
    # which is just `"Congo" in s` — it sent *every* Congo (including
    # the Republic of the Congo) to DR Congo. Require both words.
    if "Democratic" in s and "Congo" in s:
        return pycountry.countries.search_fuzzy("congo")[1].alpha_3
    if "Swaziland" in s:
        # The country was renamed Eswatini; exact get() is reliable here.
        return pycountry.countries.get(name="Eswatini").alpha_3
    try:
        return pycountry.countries.search_fuzzy(s)[0].alpha_3
    except LookupError:
        # search_fuzzy raises LookupError when nothing matches.
        print("{} not in pycountry".format(s))
        return None
# Resolve each distinct author-entered country to an ISO code once,
# then broadcast the codes back onto the author rows.
countries = dict.fromkeys(agg_authors['Country'])
for entry in countries:
    countries[entry] = get_ISO(entry)
agg_authors["ADM0_A3"] = agg_authors["Country"].apply(lambda x: countries[x])
thierry.gallopin@espci.fr not in pycountry Macau not in pycountry bryan.wilkins@biologie.uni-goettingen.de not in pycountry Ken.Dunn@UHSM.NHS.UK not in pycountry Réunion not in pycountry
# Per-country author counts joined onto the world shapefile.
shapefile = glob.glob(RD + '*.shp')[0]
gdf = gpd.read_file(shapefile)[['ADM0_A3', 'geometry']].to_crs('+proj=robin')
authors_countries = agg_authors[["ADM0_A3"]]
authors_countries["country_count"] = agg_authors.groupby('ADM0_A3')["ADM0_A3"].transform("count")
authors_countries = authors_countries.drop_duplicates()
authors_countries = gdf.merge(authors_countries, on="ADM0_A3")
# Choropleth of submitting authors across the world (quantile bins).
ax = authors_countries.dropna().plot(
    column="country_count",
    cmap='Blues',
    figsize=(20, 10),
    scheme='quantiles',
    k=10,
    legend=True)
ax.set_title('Countries Represented by N Authors Submitting to eLife',
             fontdict={'fontsize': 20}, loc='center')
ax.set_axis_off()
ax.set_xlim([-1.5e7, 1.7e7])
ax.get_legend().set_bbox_to_anchor((.12, .4))
# Country share among corresponding authors only.
rule = "-" * 32
print(rule)
print("CORRESPONDING AUTHORS' COUNTRIES")
print(rule)
corr_mask = agg_authors['Author type'] == "Corresponding Author"
pd.DataFrame(round(agg_authors['Country'][corr_mask].value_counts(normalize=True)[:10], 2))
-------------------------------- CORRESPONDING AUTHORS' COUNTRIES --------------------------------
| Country | |
|---|---|
| United States | 0.42 |
| United Kingdom | 0.10 |
| Germany | 0.09 |
| China | 0.05 |
| France | 0.05 |
| Japan | 0.03 |
| Canada | 0.03 |
| Switzerland | 0.02 |
| Netherlands | 0.02 |
| Australia | 0.02 |
# Commonwealth share among corresponding authors only.
rule = "-" * 40
print(rule)
print("CORRESPONDING AUTHORS FROM COMMONWEALTH")
print(rule)
corr_mask = agg_authors['Author type'] == "Corresponding Author"
pd.DataFrame(round(agg_authors['a_commonwealth'][corr_mask].value_counts(normalize=True)[:10], 2))
---------------------------------------- CORRESPONDING AUTHORS FROM COMMONWEALTH ----------------------------------------
| a_commonwealth | |
|---|---|
| 1 | 0.59 |
| 0 | 0.41 |
# Asia share among corresponding authors only.
rule = "-" * 33
print(rule)
print("CORRESPONDING AUTHORS FROM ASIA")
print(rule)
corr_mask = agg_authors['Author type'] == "Corresponding Author"
pd.DataFrame(round(agg_authors['a_asia'][corr_mask].value_counts(normalize=True)[:10], 2))
--------------------------------- CORRESPONDING AUTHORS FROM ASIA ---------------------------------
| a_asia | |
|---|---|
| 0 | 0.86 |
| 1 | 0.14 |
# Gender split among corresponding authors only.
rule = "-" * 35
n_corr = authors[authors['Author type'] == "Corresponding Author"].shape[0]
print(rule)
print("CORRESPONDING AUTHOR GENDERS")
print("Total Corresponding Authors = {}".format(n_corr))
print(rule)
corr_mask = agg_authors['Author type'] == "Corresponding Author"
pd.DataFrame(round(agg_authors['gender'][corr_mask].value_counts(normalize=True)[:10], 2))
----------------------------------- CORRESPONDING AUTHOR GENDERS Total Corresponding Authors = 21753 -----------------------------------
| gender | |
|---|---|
| man | 0.69 |
| woman | 0.28 |
| unknown | 0.03 |
# Per-country corresponding-author counts joined onto the shapefile.
shapefile = glob.glob(RD + '*.shp')[0]
gdf = gpd.read_file(shapefile)[['ADM0_A3', 'geometry']].to_crs('+proj=robin')
is_corr = agg_authors['Author type'] == "Corresponding Author"
corr_auth_countries = agg_authors[["ADM0_A3"]][is_corr]
corr_auth_countries["country_count"] = agg_authors[is_corr].groupby('ADM0_A3')["ADM0_A3"].transform("count")
corr_auth_countries = corr_auth_countries.drop_duplicates()
corr_auth_countries = gdf.merge(corr_auth_countries, on="ADM0_A3")
n_bins = 8
cmap = 'Greens'
figsize = (20, 10)
title = 'Countries Represented by N CORRESPONDING Authors Submitting to eLife'
# Choropleth of corresponding authors (quantile bins).
ax = corr_auth_countries.dropna().plot(column="country_count",
                                       cmap=cmap,
                                       figsize=figsize,
                                       scheme='quantiles',
                                       k=n_bins,
                                       legend=True)
ax.set_title(title, fontdict={'fontsize': 20}, loc='center')
ax.set_axis_off()
ax.set_xlim([-1.5e7, 1.7e7])
ax.get_legend().set_bbox_to_anchor((.12, .4))
The countries of reviewers who have email TLDs .com or .org are classified as "unknown" since these are not linked up to a specific country.
# One row per unique reviewer for person-level statistics.
agg_reviewers = reviewers.drop_duplicates(subset=["Reviewer ID"])
rule = "-" * 32
print(rule)
print("REVIEWERS PER COUNTRY (Top 10)")
print("Total Reviewers = {}".format(agg_reviewers.shape[0]))
print(rule)
pd.DataFrame(round(agg_reviewers['r_country'].value_counts(normalize=True)[:10], 2))
-------------------------------- REVIEWERS PER COUNTRY (Top 10) Total Reviewers = 22942 --------------------------------
| r_country | |
|---|---|
| United States | 0.45 |
| unknown | 0.10 |
| United Kingdom | 0.10 |
| Germany | 0.07 |
| Saint Martin | 0.04 |
| Canada | 0.04 |
| Switzerland | 0.02 |
| Japan | 0.02 |
| Sint Eustatius | 0.02 |
| Australia | 0.02 |
# Share of unique reviewers by world region.
rule = "-" * 32
print(rule)
print("REVIEWERS PER REGION")
print("Total Reviewers = {}".format(agg_reviewers.shape[0]))
print(rule)
pd.DataFrame(round(agg_reviewers['r_region'].value_counts(normalize=True)[:10], 2))
-------------------------------- REVIEWERS PER REGION Total Reviewers = 22942 --------------------------------
| r_region | |
|---|---|
| Americas | 0.55 |
| Europe | 0.26 |
| unknown | 0.12 |
| Asia | 0.05 |
| Oceania | 0.02 |
| Africa | 0.00 |
# Commonwealth share among unique reviewers.
rule = "-" * 36
print(rule)
print("REVIEWERS FROM COMMONWEALTH (=1)")
print("Total Reviewers = {}".format(agg_reviewers.shape[0]))
print(rule)
pd.DataFrame(round(agg_reviewers['r_commonwealth'].value_counts(normalize=True)[:10], 2))
------------------------------------ REVIEWERS FROM COMMONWEALTH (=1) Total Reviewers = 22942 ------------------------------------
| r_commonwealth | |
|---|---|
| 1 | 0.61 |
| 0 | 0.39 |
# Gender split among unique reviewers.
rule = "-" * 30
print(rule)
print("REVIEWER GENDERS")
print("Total Reviewers = {}".format(agg_reviewers.shape[0]))
print(rule)
pd.DataFrame(round(agg_reviewers['gender'].value_counts(normalize=True)[:10], 2))
------------------------------ REVIEWER GENDERS Total Reviewers = 22942 ------------------------------
| gender | |
|---|---|
| man | 0.73 |
| woman | 0.25 |
| unknown | 0.02 |
# Resolve each distinct reviewer-entered country to an ISO code once,
# then broadcast the codes back onto the reviewer rows.
countries = dict.fromkeys(agg_reviewers['r_country'])
for entry in countries:
    countries[entry] = get_ISO(entry)
agg_reviewers["ADM0_A3"] = agg_reviewers["r_country"].apply(lambda x: countries[x])
unknown not in pycountry European Union not in pycountry Ascension Island not in pycountry Basque Country not in pycountry Catalonia not in pycountry Macau not in pycountry
# reviewer countries df
shapefile = glob.glob(RD + '*.shp')[0]
gdf = gpd.read_file(shapefile)[['ADM0_A3', 'geometry']].to_crs('+proj=robin')
reviewers_countries = agg_reviewers[["ADM0_A3"]]
reviewers_countries["country_count"] = agg_reviewers.groupby('ADM0_A3')["ADM0_A3"].transform("count")
reviewers_countries = reviewers_countries.drop_duplicates()
reviewers_countries = gdf.merge(reviewers_countries, on="ADM0_A3")
n_bins = 8
cmap = 'Reds'
figsize = (20, 10)
title = 'Countries Represented by N Reviewers of eLife'
# BUG FIX: this cell previously plotted corr_auth_countries (a copy-paste
# from the corresponding-author map), so the reviewer map actually showed
# corresponding-author counts. Plot the reviewer layer built above.
ax = reviewers_countries.dropna().plot(column="country_count",
                                       cmap=cmap,
                                       figsize=figsize,
                                       scheme='quantiles',
                                       k=n_bins,
                                       legend=True)
ax.set_title(title, fontdict={'fontsize': 20}, loc='center')
ax.set_axis_off()
ax.set_xlim([-1.5e7, 1.7e7])
ax.get_legend().set_bbox_to_anchor((.12, .4))
These stats jointly describe the authors collaborating on a given manuscript.
# Histogram: share of Asia-based authors on each manuscript.
title_color, tick_color, edge_color, data_color = 'black', 'darkgray', 'dimgray', 'gainsboro'
font_size = 25
font_weight = "bold"
fig, ax = plt.subplots()
plt.style.use('seaborn-white')
plt.setp(ax.spines.values(), color=data_color)
share_asia = authors.groupby('Manuscript no.')['a_asia'].mean()
plt.hist(share_asia, bins=75, range=[0, 1],
         edgecolor=edge_color, facecolor=data_color, linewidth=1.5)
plt.xlabel('Prop. of Authors from Asia per MS',
           fontsize=font_size, labelpad=25,
           color=title_color, weight=font_weight)
plt.ylabel('N\nManuscripts',
           fontsize=font_size, rotation=0, labelpad=90,
           color=title_color, weight=font_weight)
# Positional True instead of the deprecated b= keyword (same behavior).
plt.grid(True, color=tick_color, axis="y", alpha=0.8,
         linestyle=':', linewidth=1)
plt.xticks(ticks=[0, .25, .5, .75, 1], fontsize=25,
           color=tick_color, weight=font_weight)
plt.yticks(fontsize=25, color=tick_color, weight=font_weight)
plt.show()
# Histogram: share of Commonwealth-based authors on each manuscript.
title_color, tick_color, edge_color, data_color = 'black', 'darkgray', 'dimgray', 'gainsboro'
font_size = 25
font_weight = "bold"
fig, ax = plt.subplots()
plt.style.use('seaborn-white')
plt.setp(ax.spines.values(), color=data_color)
share_cw = authors.groupby('Manuscript no.')['a_commonwealth'].mean()
plt.hist(share_cw, bins=75, range=[0, 1],
         edgecolor=edge_color, facecolor=data_color, linewidth=1.5)
plt.xlabel('Prop. of Authors from\nCommonwealth Countries per MS',
           fontsize=font_size, labelpad=25,
           color=title_color, weight=font_weight)
plt.ylabel('N\nManuscripts',
           fontsize=font_size, rotation=0, labelpad=90,
           color=title_color, weight=font_weight)
# Positional True instead of the deprecated b= keyword (same behavior).
plt.grid(True, color=tick_color, axis="y", alpha=0.8,
         linestyle=':', linewidth=1)
plt.xticks(ticks=[0, .25, .5, .75, 1], fontsize=25,
           color=tick_color, weight=font_weight)
plt.yticks(fontsize=25, color=tick_color, weight=font_weight)
plt.show()
# Histogram: country diversity of each manuscript's author team.
title_color, tick_color, edge_color, data_color = 'black', 'darkgray', 'dimgray', 'gainsboro'
font_size = 25
font_weight = "bold"
fig, ax = plt.subplots()
plt.style.use('seaborn-white')
plt.setp(ax.spines.values(), color=data_color)
diversity = authors.groupby('Manuscript no.')['country_diversity'].mean()
plt.hist(diversity, bins=75, range=[0, 1],
         edgecolor=edge_color, facecolor=data_color, linewidth=1.5)
plt.xlabel('Prop. of Unique Countries\n Represented by Authors per MS',
           fontsize=font_size, labelpad=25,
           color=title_color, weight=font_weight)
plt.ylabel('N\nManuscripts',
           fontsize=font_size, rotation=0, labelpad=90,
           color=title_color, weight=font_weight)
# Positional True instead of the deprecated b= keyword (same behavior).
plt.grid(True, color=tick_color, axis="y", alpha=0.8,
         linestyle=':', linewidth=1)
plt.xticks(ticks=[0, .25, .5, .75, 1], fontsize=25,
           color=tick_color, weight=font_weight)
plt.yticks(fontsize=25, color=tick_color, weight=font_weight)
# Annotate the scale endpoints for readers.
plt.text(.05, 12500, "0 = no unique countries", fontsize=20)
plt.text(.55, 1500, "all unique countries = 1", fontsize=20)
plt.show()
# Histogram: mean probability-of-woman across each manuscript's authors.
title_color, tick_color, edge_color, data_color = 'black', 'darkgray', 'dimgray', 'gainsboro'
font_size = 25
font_weight = "bold"
fig, ax = plt.subplots()
plt.style.use('seaborn-white')
plt.setp(ax.spines.values(), color=data_color)
share_women = authors.groupby('Manuscript no.')['prob_woman'].mean()
plt.hist(share_women, bins=75, range=[0, 1],
         edgecolor=edge_color, facecolor=data_color, linewidth=1.5)
plt.xlabel('Prop. of Women Authors per MS',
           fontsize=font_size, labelpad=25,
           color=title_color, weight=font_weight)
plt.ylabel('N\nManuscripts',
           fontsize=font_size, rotation=0, labelpad=90,
           color=title_color, weight=font_weight)
# Positional True instead of the deprecated b= keyword (same behavior).
plt.grid(True, color=tick_color, axis="y", alpha=0.8,
         linestyle=':', linewidth=1)
plt.xticks(ticks=[0, .25, .5, .75, 1], fontsize=25,
           color=tick_color, weight=font_weight)
plt.yticks(fontsize=25, color=tick_color, weight=font_weight)
plt.show()
# Align manuscript keys and join editorial decisions onto author rows.
manuscripts = manuscripts.rename(columns={"ms": "Manuscript no."})
manuscripts["Manuscript no."] = manuscripts["Manuscript no."].astype(int)
authors["Manuscript no."] = authors["Manuscript no."].astype(int)
authors_outcomes = authors.merge(manuscripts, how="left", on="Manuscript no.")
rule = "-" * 45
print(rule)
print("INITIAL DECISION BY AUTHOR IN AMERICAS (=1):")
print(rule)
round(pd.crosstab(authors_outcomes['a_america'],
                  authors_outcomes['initial_decision'],
                  normalize='index') * 100, 2)
--------------------------------------------- INITIAL DECISION BY AUTHOR IN AMERICAS (=1): ---------------------------------------------
| initial_decision | Encourage Full Submission | Reject Initial Submission | Simple Withdraw |
|---|---|---|---|
| a_america | |||
| 0 | 97.90 | 2.07 | 0.03 |
| 1 | 97.97 | 2.03 | 0.00 |
# Initial decision rates by author-in-Asia indicator (row-normalized %).
rule = "-" * 45
print(rule)
print("INITIAL DECISION BY AUTHOR IN ASIA (=1):")
print(rule)
round(pd.crosstab(authors_outcomes['a_asia'],
                  authors_outcomes['initial_decision'],
                  normalize='index') * 100, 2)
--------------------------------------------- INITIAL DECISION BY AUTHOR IN ASIA (=1): ---------------------------------------------
| initial_decision | Encourage Full Submission | Reject Initial Submission | Simple Withdraw |
|---|---|---|---|
| a_asia | |||
| 0 | 97.85 | 2.15 | 0.00 |
| 1 | 98.44 | 1.43 | 0.13 |
# Initial decision rates by author gender (row-normalized %).
rule = "-" * 45
print(rule)
print("INITIAL DECISION BY AUTHOR GENDER:")
print(rule)
round(pd.crosstab(authors_outcomes['gender'],
                  authors_outcomes['initial_decision'],
                  normalize='index') * 100, 2)
--------------------------------------------- INITIAL DECISION BY AUTHOR GENDER: ---------------------------------------------
| initial_decision | Encourage Full Submission | Reject Initial Submission | Simple Withdraw |
|---|---|---|---|
| gender | |||
| man | 97.95 | 2.04 | 0.02 |
| unknown | 98.05 | 1.92 | 0.02 |
| woman | 97.89 | 2.09 | 0.02 |
Africa/Antarctica have very few authors.
# Initial decision rates by author world region (row-normalized %).
rule = "-" * 45
print(rule)
print("INITIAL DECISION BY REGION:")
print(rule)
round(pd.crosstab(authors_outcomes['a_region'],
                  authors_outcomes['initial_decision'],
                  normalize='index') * 100, 2)
--------------------------------------------- INITIAL DECISION BY REGION: ---------------------------------------------
| initial_decision | Encourage Full Submission | Reject Initial Submission | Simple Withdraw |
|---|---|---|---|
| a_region | |||
| Africa | 98.04 | 1.96 | 0.00 |
| Americas | 97.97 | 2.03 | 0.00 |
| Antarctic | 100.00 | 0.00 | 0.00 |
| Asia | 98.44 | 1.43 | 0.13 |
| Europe | 97.67 | 2.33 | 0.00 |
| Oceania | 98.68 | 1.32 | 0.00 |
| unknown | 97.51 | 2.49 | 0.00 |
# Full-submission decision rates by author-in-Americas indicator.
rule = "-" * 54
print(rule)
print("FINAL DECISION BY AUTHORS IN THE AMERICAS (=1):")
print(rule)
round(pd.crosstab(authors_outcomes['a_america'],
                  authors_outcomes['full_decision'],
                  normalize='index') * 100, 2)
------------------------------------------------------ FINAL DECISION BY AUTHORS IN THE AMERICAS (=1): ------------------------------------------------------
| full_decision | Accept Full Submission | Reject Full Submission | Revise Full Submission | Simple Withdraw |
|---|---|---|---|---|
| a_america | ||||
| 0 | 1.19 | 45.04 | 53.47 | 0.30 |
| 1 | 1.76 | 36.73 | 61.03 | 0.48 |
# Full-submission decision rates by author-in-Asia indicator.
rule = "-" * 54
print(rule)
print("FINAL DECISION BY AUTHORS IN ASIA (=1):")
print(rule)
round(pd.crosstab(authors_outcomes['a_asia'],
                  authors_outcomes['full_decision'],
                  normalize='index') * 100, 2)
------------------------------------------------------ FINAL DECISION BY AUTHORS IN ASIA (=1): ------------------------------------------------------
| full_decision | Accept Full Submission | Reject Full Submission | Revise Full Submission | Simple Withdraw |
|---|---|---|---|---|
| a_asia | ||||
| 0 | 1.55 | 39.68 | 58.40 | 0.36 |
| 1 | 0.78 | 51.13 | 47.57 | 0.53 |
# Full-submission decision rates by author gender.
rule = "-" * 54
print(rule)
print("FINAL DECISION BY AUTHOR GENDER:")
print(rule)
round(pd.crosstab(authors_outcomes['gender'],
                  authors_outcomes['full_decision'],
                  normalize='index') * 100, 2)
------------------------------------------------------ FINAL DECISION BY AUTHOR GENDER: ------------------------------------------------------
| full_decision | Accept Full Submission | Reject Full Submission | Revise Full Submission | Simple Withdraw |
|---|---|---|---|---|
| gender | ||||
| man | 1.44 | 40.84 | 57.32 | 0.40 |
| unknown | 1.03 | 47.06 | 51.62 | 0.29 |
| woman | 1.51 | 41.53 | 56.60 | 0.36 |
# Full-submission decision rates by author world region.
rule = "-" * 54
print(rule)
print("FINAL DECISION BY AUTHOR WORLD REGION:")
print(rule)
round(pd.crosstab(authors_outcomes['a_region'],
                  authors_outcomes['full_decision'],
                  normalize='index') * 100, 2)
------------------------------------------------------ FINAL DECISION BY AUTHOR WORLD REGION: ------------------------------------------------------
| full_decision | Accept Full Submission | Reject Full Submission | Revise Full Submission | Simple Withdraw |
|---|---|---|---|---|
| a_region | ||||
| Africa | 0.00 | 34.96 | 65.04 | 0.00 |
| Americas | 1.76 | 36.73 | 61.03 | 0.48 |
| Antarctic | 0.00 | 0.00 | 100.00 | 0.00 |
| Asia | 0.78 | 51.13 | 47.57 | 0.53 |
| Europe | 1.43 | 42.12 | 56.21 | 0.23 |
| Oceania | 0.47 | 42.35 | 57.12 | 0.06 |
| unknown | 0.47 | 64.74 | 34.39 | 0.39 |
# Pair every review with each (deduplicated) author on its manuscript.
reviews["Manuscript no."] = reviews["Manuscript no."].astype(int)
agg_authors["Manuscript no."] = agg_authors["Manuscript no."].astype(int)
reviews_authors = reviews.merge(agg_authors, on="Manuscript no.")
rule = "-" * 54
print(rule)
print("RATINGS BY AUTHOR REGION *")
print(" * counts are review-author combos, i.e. a single \n review can be paired with n authors per ms")
print(rule)
round(reviews_authors.groupby('a_region')['std_rating_hat'].describe(), 2)
------------------------------------------------------
RATINGS BY AUTHOR REGION *
* counts are review-author combos, i.e. a single
review can be paired with n authors per ms
------------------------------------------------------
| count | mean | std | min | 25% | 50% | 75% | max | |
|---|---|---|---|---|---|---|---|---|
| a_region | ||||||||
| Africa | 1466.0 | 0.02 | 0.96 | -2.76 | -0.69 | 0.12 | 0.57 | 2.09 |
| Americas | 111944.0 | 0.04 | 0.97 | -2.95 | -0.59 | 0.15 | 0.71 | 2.74 |
| Antarctic | 2.0 | -0.30 | 1.89 | -1.64 | -0.97 | -0.30 | 0.37 | 1.04 |
| Asia | 34583.0 | -0.23 | 1.01 | -2.80 | -0.97 | -0.13 | 0.48 | 2.60 |
| Europe | 89794.0 | -0.06 | 0.98 | -2.95 | -0.74 | 0.03 | 0.63 | 2.77 |
| Oceania | 5929.0 | -0.01 | 0.99 | -2.79 | -0.70 | 0.11 | 0.69 | 2.55 |
| unknown | 4251.0 | -0.33 | 0.98 | -2.79 | -1.09 | -0.30 | 0.44 | 2.60 |
# Rating distribution by corresponding author's world region.
rule = "-" * 34
print(rule)
print("RATINGS BY CORRES. AUTHOR REGION")
print(" * counts = n reviews with\n auth. characteristic")
print(rule)
corr_mask = reviews_authors['Author type'] == "Corresponding Author"
round(reviews_authors[corr_mask].groupby("a_region")['std_rating_hat'].describe(), 2)
----------------------------------
RATINGS BY CORRES. AUTHOR REGION
* counts = n reviews with
auth. characteristic
----------------------------------
| count | mean | std | min | 25% | 50% | 75% | max | |
|---|---|---|---|---|---|---|---|---|
| a_region | ||||||||
| Africa | 57.0 | -0.25 | 1.08 | -2.76 | -1.06 | -0.02 | 0.45 | 1.41 |
| Americas | 11800.0 | -0.01 | 0.97 | -2.91 | -0.66 | 0.10 | 0.67 | 2.51 |
| Asia | 3203.0 | -0.32 | 1.00 | -2.80 | -1.06 | -0.23 | 0.40 | 2.48 |
| Europe | 9310.0 | -0.11 | 0.98 | -2.95 | -0.79 | -0.01 | 0.58 | 2.74 |
| Oceania | 518.0 | -0.06 | 1.00 | -2.69 | -0.78 | 0.05 | 0.63 | 2.38 |
| unknown | 206.0 | -0.33 | 1.02 | -2.65 | -1.03 | -0.25 | 0.43 | 2.07 |
# Rating distribution by corresponding author's Commonwealth status.
rule = "-" * 41
print(rule)
print("RATINGS BY CORRES. AUTHOR IN COMMONWEALTH")
print(" * counts = n reviews with\n auth. characteristic")
print(rule)
corr_mask = reviews_authors['Author type'] == "Corresponding Author"
round(reviews_authors[corr_mask].groupby("a_commonwealth")['std_rating_hat'].describe(), 2)
-----------------------------------------
RATINGS BY CORRES. AUTHOR IN COMMONWEALTH
* counts = n reviews with
auth. characteristic
-----------------------------------------
| count | mean | std | min | 25% | 50% | 75% | max | |
|---|---|---|---|---|---|---|---|---|
| a_commonwealth | ||||||||
| 0 | 10052.0 | -0.18 | 0.99 | -2.85 | -0.89 | -0.07 | 0.52 | 2.74 |
| 1 | 15042.0 | -0.03 | 0.98 | -2.95 | -0.69 | 0.08 | 0.66 | 2.51 |
Note what this is asking: do more country-diverse manuscripts get better or worse reviews, on average? There is no relationship.
# Spearman correlation: MS-level country diversity vs. mean rating.
rule = "-" * 35
print(rule)
print("CORR. BETWEEN COUNTRY DIVERSITY OF\nCOAUTHORS & AVG REVIEWER RATING:")
print(rule)
by_ms = reviews_authors.groupby('Manuscript no.')
corr = scipy.stats.spearmanr(by_ms['country_diversity'].mean(),
                             by_ms['std_rating_hat'].mean(),
                             nan_policy="omit")
print("r = {}\np = {}".format(round(corr[0], 2), round(corr[1], 3)))
print(rule)
----------------------------------- CORR. BETWEEN COUNTRY DIVERSITY OF COAUTHORS & AVG REVIEWER RATING: ----------------------------------- r = -0.03 p = 0.001 -----------------------------------
# Rating distribution by author gender (all author types).
rule = "-" * 34
print(rule)
print("RATINGS BY AUTHOR GENDER *")
print(" * counts = n reviews with\n auth. characteristic")
print(rule)
round(reviews_authors.groupby('gender')['std_rating_hat'].describe(), 2)
----------------------------------
RATINGS BY AUTHOR GENDER *
* counts = n reviews with
auth. characteristic
----------------------------------
| count | mean | std | min | 25% | 50% | 75% | max | |
|---|---|---|---|---|---|---|---|---|
| gender | ||||||||
| man | 150971.0 | -0.04 | 0.99 | -2.95 | -0.70 | 0.07 | 0.65 | 2.77 |
| unknown | 8125.0 | -0.13 | 1.01 | -2.95 | -0.85 | -0.03 | 0.57 | 2.60 |
| woman | 87752.0 | -0.04 | 0.98 | -2.95 | -0.71 | 0.07 | 0.64 | 2.77 |
# Rating distribution by corresponding author's gender.
rule = "-" * 34
print(rule)
print("RATINGS BY CORRES. AUTHOR GENDER")
print(" * counts = n reviews with\n auth. characteristic")
print(rule)
corr_mask = reviews_authors['Author type'] == "Corresponding Author"
round(reviews_authors[corr_mask].groupby("gender")['std_rating_hat'].describe(), 2)
----------------------------------
RATINGS BY CORRES. AUTHOR GENDER
* counts = n reviews with
auth. characteristic
----------------------------------
| count | mean | std | min | 25% | 50% | 75% | max | |
|---|---|---|---|---|---|---|---|---|
| gender | ||||||||
| man | 17326.0 | -0.08 | 0.98 | -2.91 | -0.74 | 0.04 | 0.62 | 2.74 |
| unknown | 761.0 | -0.23 | 1.03 | -2.74 | -1.04 | -0.17 | 0.51 | 2.20 |
| woman | 6784.0 | -0.12 | 0.98 | -2.95 | -0.81 | -0.02 | 0.57 | 2.52 |
# Spearman correlation: MS-level mean prob_woman vs. mean rating.
rule = "-" * 33
print(rule)
print("CORR. BETWEEN GENDER DIVERSITY OF\nCOAUTHORS & AVG REVIEWER RATING:")
print(rule)
by_ms = reviews_authors.groupby('Manuscript no.')
corr = scipy.stats.spearmanr(by_ms['prob_woman'].mean(),
                             by_ms['std_rating_hat'].mean(),
                             nan_policy="omit")
print("r = {}\np = {}".format(round(corr[0], 2), round(corr[1], 3)))
print(rule)
--------------------------------- CORR. BETWEEN GENDER DIVERSITY OF COAUTHORS & AVG REVIEWER RATING: --------------------------------- r = -0.01 p = 0.147 ---------------------------------
# Spearman correlation: review length vs. standardized rating.
rule = "-" * 35
print(rule)
print("CORR. BETWEEN LENGTH AND RATING:")
corr = scipy.stats.spearmanr(reviews['lengths'], reviews['std_rating_hat'])
print("r = {}\np = {}".format(round(corr[0], 2), round(corr[1], 3)))
print(rule)
----------------------------------- CORR. BETWEEN LENGTH AND RATING: r = -0.28 p = 0.0 -----------------------------------
# Code found and adapted from:
# https://stackoverflow.com/questions/27164114/show-confidence-limits-and-prediction-limits-in-scatter-plot
# Helper functions -------------------------------------------------------------
def equation(a, b):
    """Evaluate the 1-D polynomial with coefficients `a` at point(s) `b`."""
    coefficients = np.asarray(a)
    return np.polyval(coefficients, b)
def plot_ci_manual(t, s_err, n, x, x2, y2, ax=None):
    r"""Shade a pointwise confidence band for a fitted line on *ax*.

    The docstring is now a raw string: the embedded LaTeX (``\left``,
    ``\hat``, ``\sqrt``, ...) produced invalid escape sequences in the
    original non-raw docstring (a SyntaxWarning on modern Python).

    Parameters
    ----------
    t : float
        Critical value of Student's t for the desired coverage.
    s_err : float
        Standard deviation of the fit residuals.
    n : int
        Number of observations used in the fit.
    x : array_like
        Observed x-values the model was fitted on.
    x2 : array_like
        Grid of x-values at which to draw the band.
    y2 : array_like
        Fitted y-values on ``x2``.
    ax : matplotlib.axes.Axes, optional
        Target axes; defaults to the current axes.

    Returns
    -------
    matplotlib.axes.Axes
        The axes the band was drawn on.

    Notes
    -----
    Fills between ``y2 +/- ci`` using the module-level ``tick_color``
    (defined later in this script, before the function is called).

    .. math:: \left| \: \hat{\mu}_{y|x0} - \mu_{y|x0} \: \right| \; \leq \; T_{n-2}^{.975} \; \hat{\sigma} \; \sqrt{\frac{1}{n}+\frac{(x_0-\bar{x})^2}{\sum_{i=1}^n{(x_i-\bar{x})^2}}}
    .. math:: \hat{\sigma} = \sqrt{\sum_{i=1}^n{\frac{(y_i-\hat{y})^2}{n-2}}}

    References
    ----------
    .. [1] M. Duarte. "Curve fitting," Jupyter Notebook.
       http://nbviewer.ipython.org/github/demotu/BMC/blob/master/notebooks/CurveFitting.ipynb
    """
    if ax is None:
        ax = plt.gca()
    # Half-width of the pointwise confidence band for the regression mean.
    ci = t * s_err * np.sqrt(1/n + (x2 - np.mean(x))**2 / np.sum((x - np.mean(x))**2))
    ax.fill_between(x2, y2 + ci, y2 - ci, color=tick_color, edgecolor=None)
    return ax
# Computations ----------------------------------------------------------------
# Fit a straight line of predicted rating on review length and derive the
# quantities needed for the confidence/prediction bands below.
x = reviews['lengths']
y = reviews['std_rating_hat']
p, cov = np.polyfit(x, y, 1, cov=True)  # parameters and covariance from of the fit of 1-D polynom.
y_model = equation(p, x)                # model using the fit parameters; NOTE: parameters here are coefficients
# Statistics
n = y.size      # number of observations
m = p.size      # number of parameters
dof = n - m     # degrees of freedom
# FIX: the plot labels the bands "95% CI" / "95% Pred. Lim.", which requires
# the 0.975 quantile (two-sided). The original used 0.95, silently producing
# 90% bands.
t = scipy.stats.t.ppf(0.975, dof)  # used for CI and PI bands
# Estimates of Error in Data/Model
resid = y - y_model
chi2 = np.sum((resid / y_model)**2)      # chi-squared; estimates error in data
chi2_red = chi2 / dof                    # reduced chi-squared; measures goodness of fit
s_err = np.sqrt(np.sum(resid**2) / dof)  # standard deviation of the error
# Plotting --------------------------------------------------------------------
# Scatter of review length vs. predicted rating, with the fitted line and
# confidence/prediction bands computed above.
# Plot globals
title_color, tick_color, edge_color, data_color = 'black', 'darkgray', 'dimgray', 'whitesmoke'
font_size = 25
font_weight = "bold"
fig, ax = plt.subplots()
# Data: one faint point per review.
ax.plot(x, y, "o",
color=data_color,
markersize=.5,
markeredgewidth=.5,
markeredgecolor=edge_color,
markerfacecolor=data_color
)
# Linear Fit: the least-squares line from np.polyfit above.
ax.plot(x, y_model, "-",
color=title_color,
linewidth=1,
label="Fit")
# Confidence Interval: shaded band drawn on a 100-point grid spanning the data.
x2 = np.linspace(np.min(x), np.max(x), 100)
y2 = equation(p, x2)
plot_ci_manual(t, s_err, n, x, x2, y2, ax=ax)
# # Prediction Interval
# Half-width of the pointwise prediction band — wider than the CI because of
# the extra "+1" residual-variance term.
pi = t * s_err * np.sqrt(1 + 1/n + (x2 - np.mean(x))**2 / np.sum((x - np.mean(x))**2))
# color="None" leaves the fill transparent; only the dashed edges drawn next show.
ax.fill_between(x2, y2 + pi, y2 - pi,
color="None",
linestyle="--",
linewidth=3)
ax.plot(x2, y2 - pi, "--",
color=title_color,
label="95% Pred. Lim.")
ax.plot(x2, y2 + pi, "--",
color=title_color)
# Custom legend
handles, labels = ax.get_legend_handles_labels()
# Indices of the auto-generated legend entries ("Fit", "95% Pred. Lim.") to keep.
display = (0, 1)
# Proxy line standing in for the CI band (fill_between has no legend handle here).
anyArtist = plt.Line2D((0, 1), (1, 0), color=tick_color) # create custom artists
legend = plt.legend(
[handle for i, handle in enumerate(handles) if i in display] + [anyArtist],
[label for i, label in enumerate(labels) if i in display] + ["95% CI"],
loc=9, bbox_to_anchor=(1, .6, .5, 0.), ncol=1, mode="expand", fontsize=20)
# NOTE(review): set_edgecolor returns None, so `frame` is always None.
frame = legend.get_frame().set_edgecolor(tick_color)
# Annotation with r (the length-vs-rating Spearman r computed earlier).
plt.text(2200, 2.4, 'r = {}'.format(round(corr[0],2)), fontsize=25)
# Styling
plt.xlabel('N Tokens per Review',
fontsize=font_size,
labelpad=25,
color=title_color,
weight=font_weight)
plt.ylabel('Predicted\nRating',
fontsize=font_size,
rotation=0,
labelpad=90,
color=title_color,
weight=font_weight)
# NOTE(review): `b=` was renamed to `visible=` in Matplotlib 3.5 and removed
# in 3.6 — update if this runs on a newer Matplotlib.
plt.grid(b=True,
color=tick_color,
axis="y",
alpha=0.8,
linestyle=':',
linewidth=1)
plt.xticks(fontsize=25,
color=tick_color,
weight=font_weight,
rotation = 60)
plt.yticks(fontsize=25,
color=tick_color,
weight=font_weight)
# Clip to the bulk of the data; very long reviews (>3000 tokens) are cut off.
plt.xlim(0, 3000)
plt.ylim(-3, 3)
plt.show()
# Attach reviewer attributes to every review, then summarize standardized
# predicted ratings by the reviewer's world region.
# (merge returns a new frame, so the original intermediate alias was redundant.)
reviews_reviewers = reviews.merge(reviewers, how="left", on="Reviewer ID")
bar = "-" * 28
print(bar)
print("RATINGS BY REVIEWER REGION")
print(bar)
round(reviews_reviewers.groupby('r_region')['std_rating_hat'].describe(), 2)
---------------------------- RATINGS BY REVIEWER REGION ----------------------------
| count | mean | std | min | 25% | 50% | 75% | max | |
|---|---|---|---|---|---|---|---|---|
| r_region | ||||||||
| Africa | 679.0 | 0.23 | 0.75 | -2.42 | -0.29 | 0.32 | 0.86 | 2.12 |
| Americas | 245146.0 | 0.06 | 1.02 | -2.95 | -0.60 | 0.17 | 0.78 | 2.77 |
| Asia | 15589.0 | 0.15 | 0.95 | -2.68 | -0.45 | 0.24 | 0.80 | 2.55 |
| Europe | 87526.0 | 0.02 | 0.99 | -2.89 | -0.63 | 0.12 | 0.68 | 2.56 |
| Oceania | 3324.0 | -0.14 | 1.05 | -2.61 | -1.00 | -0.12 | 0.60 | 2.60 |
| unknown | 37036.0 | 0.02 | 0.99 | -2.93 | -0.61 | 0.13 | 0.67 | 2.50 |
# Summary of ratings by whether the reviewer is based in a Commonwealth country.
bar = "-" * 40
print(bar)
print("RATINGS BY REVIEWER IN COMMONWEALTH")
print(bar)
round(reviews_reviewers.groupby('r_commonwealth')['std_rating_hat'].describe(), 2)
---------------------------------------- RATINGS BY REVIEWER IN COMMONWEALTH ----------------------------------------
| count | mean | std | min | 25% | 50% | 75% | max | |
|---|---|---|---|---|---|---|---|---|
| r_commonwealth | ||||||||
| 0.0 | 111994.0 | 0.06 | 0.98 | -2.93 | -0.55 | 0.15 | 0.70 | 2.74 |
| 1.0 | 277306.0 | 0.05 | 1.02 | -2.95 | -0.62 | 0.15 | 0.77 | 2.77 |
# Summary of ratings by reviewer gender.
bar = "-" * 34
print(bar)
print("RATINGS BY REVIEWER GENDER")
print(bar)
round(reviews_reviewers.groupby('gender')['std_rating_hat'].describe(), 2)
---------------------------------- RATINGS BY REVIEWER GENDER ----------------------------------
| count | mean | std | min | 25% | 50% | 75% | max | |
|---|---|---|---|---|---|---|---|---|
| gender | ||||||||
| man | 301162.0 | 0.05 | 1.02 | -2.95 | -0.61 | 0.15 | 0.75 | 2.77 |
| unknown | 7525.0 | 0.26 | 0.87 | -2.40 | -0.26 | 0.38 | 0.86 | 2.55 |
| woman | 80187.0 | 0.05 | 1.00 | -2.89 | -0.57 | 0.14 | 0.73 | 2.59 |
# List the distinct reviewer first names (with country) whose gender was left
# "unknown". keep="first" and inplace=False are the drop_duplicates defaults,
# so they are omitted here.
unknown_mask = reviews_reviewers["gender"] == "unknown"
reviews_reviewers.loc[unknown_mask, ["name", "gender", "r_country"]].drop_duplicates()
| name | gender | r_country | |
|---|---|---|---|
| 961 | Xiao-Fan | unknown | United States |
| 1346 | Tzumin | unknown | unknown |
| 2609 | Taekjip | unknown | United States |
| 2634 | Leemor | unknown | United States |
| 4073 | Panisadee | unknown | Thailand |
| ... | ... | ... | ... |
| 384211 | Dr. | unknown | United Kingdom |
| 387186 | McKell | unknown | unknown |
| 387212 | Chuanji | unknown | Sint Eustatius |
| 387883 | Houfeng | unknown | China, People’s Republic of |
| 389022 | Aymelt | unknown | Germany |
323 rows × 3 columns